/*==============================================================================
Cattaneo2数据集DDML分析 - 增强版（含图表绘制和Word导出）
================================================================================

研究目标：
使用双重机器学习(Double/Debiased Machine Learning, DDML)方法分析母亲吸烟行为
对婴儿出生体重的影响，控制混杂因素后获得因果效应估计。

数据来源：
Cattaneo (2010) 关于母亲吸烟与婴儿出生体重的研究数据

变量说明：
- bweight: 婴儿出生体重（克）- 结果变量Y
- mbsmoke: 母亲孕期吸烟（1=吸烟，0=不吸烟）- 处理变量D  
- prenatal1: 首次产前检查是否在孕早期（1=是，0=否）
- mmarried: 母亲婚姻状况（1=已婚，0=未婚）
- fbaby: 第一个孩子（1=是，0=否）
- mage: 母亲年龄
- medu: 母亲教育年限

分析方法：
1. 描述性统计分析
2. 数据可视化
3. DDML因果效应估计
4. 结果解释和报告生成

作者：Stata ML Course
日期：2025-11-06
==============================================================================*/

* Reset the session, turn off --more-- paging, and close any log that an
* earlier (possibly aborted) run left open.
clear all
set more off
capture log close

* Create the output directory tree. mkdir does not create missing parent
* folders, so "output" is made first; capture ignores "already exists" errors.
capture mkdir "output"
capture mkdir "output/cattaneo2"
capture mkdir "output/cattaneo2/figures"
capture mkdir "output/cattaneo2/tables"

* Start logging so the log close at the end of the script has a log to close.
log using "output/cattaneo2/cattaneo2_ddml_analysis.log", replace text

/*------------------------------------------------------------------------------
第一部分：数据加载和描述性统计
------------------------------------------------------------------------------*/

* Load the cattaneo2 example dataset, replacing whatever is in memory
webuse cattaneo2, clear

* Global macros naming the outcome (Y), the treatment (D) and the covariates (X);
* every later section refers to the variables through these macros.
global Y bweight
global D mbsmoke
global X prenatal1 mmarried fbaby mage medu

* Descriptive statistics for the outcome, treatment and covariates
summarize $Y $D $X

* Mean and standard deviation of the outcome within each treatment group
table $D, statistic(mean $Y) statistic(sd $Y)

/*------------------------------------------------------------------------------
第二部分：数据可视化
------------------------------------------------------------------------------*/

* Exploratory figures. Each graph is exported as a PNG under
* output/cattaneo2/figures.
*
* For graph bar / graph box with a single yvar, the categories of over() are
* drawn as axis labels, not legend keys, so the group names are set with
* over(, relabel()) rather than legend(label()).

* 2.1 Histogram of birth weight (100 g bins, frequency scale)
histogram $Y, width(100) frequency ///
    title("婴儿出生体重分布") ///
    xtitle("出生体重（克）") ytitle("频数") ///
    note("数据来源：Cattaneo (2010)") scheme(s2color)
graph export "output/cattaneo2/figures/bweight_histogram.png", as(png) replace

* 2.2 Mean birth weight by smoking status
graph bar (mean) $Y, over($D, relabel(1 "不吸烟" 2 "吸烟")) ///
    title("按母亲吸烟状态的婴儿平均出生体重") ///
    ytitle("平均出生体重（克）") blabel(bar, format(%9.0f)) ///
    scheme(s2color)
graph export "output/cattaneo2/figures/bweight_by_smoking.png", as(png) replace

* 2.3 Box plot of birth weight by smoking status
graph box $Y, over($D, relabel(1 "不吸烟" 2 "吸烟")) ///
    title("出生体重分布箱线图") ytitle("出生体重（克）") ///
    scheme(s2color)
graph export "output/cattaneo2/figures/bweight_boxplot.png", as(png) replace

* 2.4 Birth weight against mother's age, with a linear fit
twoway (scatter $Y mage, mcolor(blue%40)) ///
       (lfit $Y mage, lcolor(red) lwidth(thick)), ///
    title("母亲年龄与出生体重关系") ///
    xtitle("母亲年龄") ytitle("出生体重（克）") ///
    legend(order(1 "数据点" 2 "线性拟合")) scheme(s2color)
graph export "output/cattaneo2/figures/bweight_mage_scatter.png", as(png) replace

* 2.5 Birth weight against mother's education, with a linear fit
twoway (scatter $Y medu, mcolor(green%40)) ///
       (lfit $Y medu, lcolor(red) lwidth(thick)), ///
    title("母亲教育年限与出生体重关系") ///
    xtitle("母亲教育年限") ytitle("出生体重（克）") ///
    legend(order(1 "数据点" 2 "线性拟合")) scheme(s2color)
graph export "output/cattaneo2/figures/bweight_medu_scatter.png", as(png) replace

* 2.6 Mean birth weight by prenatal1. The report tables in this file treat
* prenatal1 as a 0/1 indicator, so the title describes it as such, not as a count.
graph bar (mean) $Y, over(prenatal1, relabel(1 "否" 2 "是")) ///
    title("孕早期是否进行首次产前检查与平均出生体重") ///
    ytitle("平均出生体重（克）") blabel(bar, format(%9.0f)) scheme(s2color)
graph export "output/cattaneo2/figures/bweight_prenatal.png", as(png) replace

* 2.7 Mean birth weight by marital status
graph bar (mean) $Y, over(mmarried, relabel(1 "未婚" 2 "已婚")) ///
    title("母亲婚姻状况与婴儿出生体重") ///
    ytitle("平均出生体重（克）") blabel(bar, format(%9.0f)) ///
    scheme(s2color)
graph export "output/cattaneo2/figures/bweight_marriage.png", as(png) replace

* 2.8 Mean birth weight by first-born status
graph bar (mean) $Y, over(fbaby, relabel(1 "非第一胎" 2 "第一胎")) ///
    title("是否第一胎与婴儿出生体重") ///
    ytitle("平均出生体重（克）") blabel(bar, format(%9.0f)) ///
    scheme(s2color)
graph export "output/cattaneo2/figures/bweight_firstbaby.png", as(png) replace

/*------------------------------------------------------------------------------
第三部分：DDML分析
------------------------------------------------------------------------------*/

* Fix the random seed so the cross-fitting fold assignment is reproducible
set seed 42

* Interactive model, 5-fold cross-fitting repeated over 5 resamples
ddml init interactive, kfolds(5) reps(5)

* Learner for the outcome conditional expectation E[Y|X,D]: linear regression
ddml E[Y|X,D]: regress $Y $X

* Learner for the treatment conditional expectation E[D|X]: logit
ddml E[D|X]: logit $D $X

* Cross-fit the learners
ddml crossfit

* Estimate the treatment effect
ddml estimate

* Keep the DDML results so they can be restored later
estimates store ddml_results

* Pull the point estimate and its standard error out of e(b) / e(V).
* NOTE(review): assumes element [1,1] of e(b) is the treatment effect posted by
* ddml estimate -- confirm against the displayed output.
matrix ddml_b = e(b)
matrix ddml_V = e(V)
local ddml_coef = ddml_b[1,1]
local ddml_se = sqrt(ddml_V[1,1])
local ddml_t = `ddml_coef' / `ddml_se'
* DDML inference is based on asymptotic normality, so the p-value and interval
* use the standard normal rather than a t distribution with e(N)-1 df.
local ddml_p = 2*normal(-abs(`ddml_t'))
local z_crit = invnormal(0.975)
local ddml_ci_lower = `ddml_coef' - `z_crit'*`ddml_se'
local ddml_ci_upper = `ddml_coef' + `z_crit'*`ddml_se'

* OLS benchmark: outcome on treatment and covariates
quietly regress $Y $D $X
estimates store ols_results
local ols_coef = _b[$D]
local ols_se = _se[$D]
local ols_t = `ols_coef' / `ols_se'
local ols_p = 2*ttail(e(df_r), abs(`ols_t'))
* Exact 97.5th percentile of the t distribution with the residual df
local t_crit = invttail(e(df_r), 0.025)
local ols_ci_lower = `ols_coef' - `t_crit'*`ols_se'
local ols_ci_upper = `ols_coef' + `t_crit'*`ols_se'
local ols_n = e(N)
local ols_r2 = e(r2)

/*------------------------------------------------------------------------------
第四部分：结果可视化
------------------------------------------------------------------------------*/

* 4.1 Point estimates and 95% confidence intervals, DDML vs OLS.
* The two-row plotting dataset is built from the locals computed in the
* estimation section, so the figure always shows the numbers actually estimated
* rather than placeholder values. preserve/restore protects the analysis data.
preserve
clear
quietly set obs 2
gen str20 method = cond(_n == 1, "DDML估计", "OLS估计")
gen double estimate = cond(_n == 1, `ddml_coef', `ols_coef')
gen double lower = cond(_n == 1, `ddml_ci_lower', `ols_ci_lower')
gen double upper = cond(_n == 1, `ddml_ci_upper', `ols_ci_upper')

* Row index doubles as the y-axis position (1 = DDML, 2 = OLS)
gen method_num = _n
twoway (rcap lower upper method_num, horizontal lcolor(blue)) ///
       (scatter method_num estimate, mcolor(red) msize(large)), ///
    title("母亲吸烟对出生体重的因果效应估计") ///
    subtitle("负值表示吸烟降低出生体重") ///
    xtitle("效应估计值（克）") ytitle("估计方法") ///
    ylabel(1 "DDML" 2 "OLS", valuelabel) ///
    xline(0, lpattern(dash) lcolor(gray)) legend(off) scheme(s2color)
graph export "output/cattaneo2/figures/causal_effect_comparison.png", as(png) replace
restore

* 4.2 Predicted vs actual birth weight.
* predict uses whichever estimation results are active at this point (the OLS
* benchmark, as the script is written). The return code is kept so that a
* failed prediction is reported instead of silently skipping the figure.
capture predict yhat, xb
local predict_rc = _rc
if `predict_rc' == 0 {
    * Dashed 45-degree line marks perfect prediction
    twoway (scatter $Y yhat, mcolor(blue%40)) ///
           (function y=x, range(2000 4500) lcolor(red) lpattern(dash)), ///
        title("预测值vs实际值") ///
        xtitle("预测出生体重（克）") ytitle("实际出生体重（克）") ///
        legend(order(1 "数据点" 2 "完美预测线")) scheme(s2color)
    graph export "output/cattaneo2/figures/prediction_vs_actual.png", as(png) replace
}
else {
    display as error "predict failed (rc = `predict_rc'); prediction_vs_actual.png was not created"
}

/*------------------------------------------------------------------------------
第五部分：生成Word报告
------------------------------------------------------------------------------*/

* Number of observations in memory, reused throughout the report text
local total_obs = _N

* Discard any document left in memory by an aborted earlier run; otherwise
* putdocx begin would fail because a document is already open.
capture putdocx clear

* Open a new Word document with a named header and footer
putdocx begin, header(main_header) footer(main_footer)

* Header content
putdocx paragraph, toheader(main_header) font("微软雅黑", 10)
putdocx text ("Cattaneo2数据集DDML分析报告 | Double Machine Learning Analysis")

* Footer content: centered page number
putdocx paragraph, tofooter(main_footer) halign(center) font("微软雅黑", 9)
putdocx text ("第 ")
putdocx pagenumber
putdocx text (" 页")

* Title page: title, subtitle, date, data source and sample size
putdocx paragraph, style(Title) halign(center)
putdocx text ("母亲吸烟对婴儿出生体重的影响"), font("微软雅黑", 28, black) bold

putdocx paragraph, halign(center) spacing(after, 10)
putdocx text ("基于双重机器学习的因果效应分析"), font("微软雅黑", 16, "gray") italic

putdocx paragraph, halign(center) spacing(after, 20)
putdocx text ("生成日期: "), font("微软雅黑", 11)
putdocx text ("`c(current_date)'"), font("微软雅黑", 11) bold

putdocx paragraph, halign(center)
putdocx text ("数据来源：Cattaneo (2010)"), font("微软雅黑", 11)

putdocx paragraph, halign(center) spacing(after, 30)
putdocx text ("样本量: `total_obs' 个观测"), font("微软雅黑", 11)

putdocx pagebreak

* Executive summary
putdocx paragraph, style(Heading1)
putdocx text ("执行摘要"), font("微软雅黑", 18) bold

putdocx textblock begin
本报告使用双重机器学习(DDML)方法分析了母亲孕期吸烟行为对婴儿出生体重的因果效应。
通过控制产前检查次数、母亲婚姻状况、是否第一胎、母亲年龄和教育年限等混杂因素，
我们获得了吸烟对出生体重影响的无偏估计。
putdocx textblock end

putdocx pagebreak

* Chapter 1: data overview
putdocx paragraph, style(Heading1)
putdocx text ("一、数据概览"), font("微软雅黑", 18) bold

putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("1.1 数据集基本信息"), font("微软雅黑", 14) bold

* This paragraph embeds the local macro total_obs, so it is written with
* putdocx text, where the macro is expanded before the command runs.
* NOTE(review): a textblock is taken from the do-file as literal text, so a
* macro inside one may reach the document unexpanded -- confirm if reverting.
putdocx paragraph
putdocx text ("本研究使用Cattaneo (2010)的数据集，包含`total_obs'个新生儿及其母亲的相关信息。")
putdocx text ("主要变量包括婴儿出生体重、母亲吸烟状态以及一系列控制变量。")

* Basic information table: 1 header row + 5 rows, 2 columns
putdocx table info = (6, 2), border(all, single, black) layout(autofitcontents)
putdocx table info(1,1) = ("项目"), bold font("微软雅黑", 11)
putdocx table info(1,2) = ("数值"), bold font("微软雅黑", 11)
putdocx table info(1,.), shading("lightblue")

putdocx table info(2,1) = ("总观测数"), font("微软雅黑", 10)
putdocx table info(2,2) = ("`total_obs'"), font("微软雅黑", 10)

putdocx table info(3,1) = ("结果变量"), font("微软雅黑", 10)
putdocx table info(3,2) = ("婴儿出生体重（克）"), font("微软雅黑", 10)

putdocx table info(4,1) = ("处理变量"), font("微软雅黑", 10)
putdocx table info(4,2) = ("母亲吸烟（0/1）"), font("微软雅黑", 10)

putdocx table info(5,1) = ("控制变量数"), font("微软雅黑", 10)
putdocx table info(5,2) = ("5个"), font("微软雅黑", 10)

putdocx table info(6,1) = ("分析方法"), font("微软雅黑", 10)
putdocx table info(6,2) = ("DDML"), font("微软雅黑", 10)

putdocx pagebreak

* Chapter 2: descriptive statistics
putdocx paragraph, style(Heading1)
putdocx text ("二、描述性统计分析"), font("微软雅黑", 18) bold

putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("2.1 全样本描述性统计"), font("微软雅黑", 14) bold

putdocx textblock begin
表1展示了所有主要变量的描述性统计信息，包括观测数、均值、标准差、最小值和最大值。
putdocx textblock end

putdocx paragraph
putdocx text ("表1 全样本描述性统计"), font("微软雅黑", 10) bold

* Table 1 (three-line table): one row per variable, built in a loop.
* The three lists are parallel: variable name, row label, and the display
* format used for the mean and SD columns.
local t1_vars   bweight mbsmoke prenatal1 mmarried fbaby mage medu
local t1_labels `""出生体重（克）" "母亲吸烟（0/1）" "产前检查（0/1）" "母亲已婚（0/1）" "第一胎（0/1）" "母亲年龄（岁）" "母亲教育年限（年）""'
local t1_fmts   %9.2f %9.3f %9.3f %9.3f %9.3f %9.2f %9.2f
local t1_k : word count `t1_vars'
* Header row + one row per variable, so no empty trailing row is left
local t1_rows = `t1_k' + 1

putdocx table tbl1 = (`t1_rows', 6), border(all, nil) layout(autofitcontents)
* Three rules: thick top, thinner rule under the header, thick bottom
putdocx table tbl1(1,.), border(top, single, black, 1.5)
putdocx table tbl1(1,.), border(bottom, single, black, 1)
putdocx table tbl1(`t1_rows',.), border(bottom, single, black, 1.5)

* Header row: first column left-aligned, the rest centered
local col = 0
foreach h in "变量" "观测数" "均值" "标准差" "最小值" "最大值" {
    local ++col
    local align = cond(`col' == 1, "left", "center")
    putdocx table tbl1(1,`col') = ("`h'"), font("微软雅黑", 10) bold halign(`align')
}

* Data rows. Each statistic is formatted with string() before it is written,
* so the displayed precision does not depend on how putdocx treats the cell.
forvalues i = 1/`t1_k' {
    local v   : word `i' of `t1_vars'
    local lab : word `i' of `t1_labels'
    local fmt : word `i' of `t1_fmts'
    local row = `i' + 1

    quietly summarize `v'
    local c2 = string(r(N), "%9.0fc")
    local c3 = string(r(mean), "`fmt'")
    local c4 = string(r(sd), "`fmt'")
    local c5 = string(r(min), "%9.0f")
    local c6 = string(r(max), "%9.0f")

    putdocx table tbl1(`row',1) = ("`lab'"), font("微软雅黑", 9) halign(left)
    forvalues c = 2/6 {
        putdocx table tbl1(`row',`c') = ("`c`c''"), font("微软雅黑", 9) halign(center)
    }
}

* Table note
putdocx paragraph
putdocx text ("注：本表展示了所有主要变量的描述性统计。样本量为`total_obs'个观测。"), font("微软雅黑", 8) italic

putdocx pagebreak

* 2.2 Descriptive statistics by smoking status (three-line table)
putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("2.2 按母亲吸烟状态分组统计"), font("微软雅黑", 14) bold

putdocx textblock begin
表2展示了按母亲吸烟状态分组的出生体重及控制变量的统计信息，以便比较吸烟组和非吸烟组的差异。
putdocx textblock end

putdocx paragraph
putdocx text ("表2 按母亲吸烟状态分组的描述性统计"), font("微软雅黑", 10) bold

* Parallel lists: variable, row label, display format for the group means.
* fbaby is included so that every control variable appears in the table.
local t2_vars   bweight mage medu prenatal1 mmarried fbaby
local t2_labels `""出生体重（克）" "母亲年龄（岁）" "母亲教育年限（年）" "产前检查比例" "已婚比例" "第一胎比例""'
local t2_fmts   %9.2f %9.2f %9.2f %9.3f %9.3f %9.3f
local t2_k : word count `t2_vars'
* Header + one row per variable + SD row under birth weight + sample-size row
local t2_rows = `t2_k' + 3

putdocx table tbl2 = (`t2_rows', 4), border(all, nil) layout(autofitcontents)
* Three rules: thick top, thinner rule under the header, thick bottom
putdocx table tbl2(1,.), border(top, single, black, 1.5)
putdocx table tbl2(1,.), border(bottom, single, black, 1)
putdocx table tbl2(`t2_rows',.), border(bottom, single, black, 1.5)

* Header row
putdocx table tbl2(1,1) = ("变量"), font("微软雅黑", 10) bold halign(left)
putdocx table tbl2(1,2) = ("非吸烟组"), font("微软雅黑", 10) bold halign(center)
putdocx table tbl2(1,3) = ("吸烟组"), font("微软雅黑", 10) bold halign(center)
putdocx table tbl2(1,4) = ("差异"), font("微软雅黑", 10) bold halign(center)

* Data rows: group means and their difference (non-smokers minus smokers)
local row = 1
forvalues i = 1/`t2_k' {
    local v   : word `i' of `t2_vars'
    local lab : word `i' of `t2_labels'
    local fmt : word `i' of `t2_fmts'

    quietly summarize `v' if mbsmoke == 0
    local m0 = r(mean)
    local s0 = r(sd)
    local n0 = r(N)
    quietly summarize `v' if mbsmoke == 1
    local m1 = r(mean)
    local s1 = r(sd)
    local n1 = r(N)
    local diff = `m0' - `m1'

    local ++row
    putdocx table tbl2(`row',1) = ("`lab'"), font("微软雅黑", 9) halign(left)
    putdocx table tbl2(`row',2) = (string(`m0', "`fmt'")), font("微软雅黑", 9) halign(center)
    putdocx table tbl2(`row',3) = (string(`m1', "`fmt'")), font("微软雅黑", 9) halign(center)
    putdocx table tbl2(`row',4) = (string(`diff', "`fmt'")), font("微软雅黑", 9) halign(center)

    if "`v'" == "bweight" {
        * Group sizes are taken from the outcome variable
        local grp_n0 = `n0'
        local grp_n1 = `n1'

        * Standard deviations of the outcome, in parentheses under the means
        local ++row
        putdocx table tbl2(`row',1) = (""), font("微软雅黑", 8) halign(left)
        putdocx table tbl2(`row',2) = ("(" + string(`s0', "%9.2f") + ")"), font("微软雅黑", 8) halign(center) italic
        putdocx table tbl2(`row',3) = ("(" + string(`s1', "%9.2f") + ")"), font("微软雅黑", 8) halign(center) italic
        putdocx table tbl2(`row',4) = (""), font("微软雅黑", 8) halign(center)
    }
}

* Sample-size row
putdocx table tbl2(`t2_rows',1) = ("观测数"), font("微软雅黑", 9) halign(left) bold
putdocx table tbl2(`t2_rows',2) = (string(`grp_n0', "%9.0fc")), font("微软雅黑", 9) halign(center) bold
putdocx table tbl2(`t2_rows',3) = (string(`grp_n1', "%9.0fc")), font("微软雅黑", 9) halign(center) bold
putdocx table tbl2(`t2_rows',4) = (""), font("微软雅黑", 9) halign(center)

* Table note
putdocx paragraph
putdocx text ("注：括号内为标准差。差异列显示非吸烟组减去吸烟组的差值。"), font("微软雅黑", 8) italic

putdocx pagebreak

* Chapter 3: data visualization. Inserts two of the PNG figures exported to
* output/cattaneo2/figures, each preceded by a short descriptive paragraph.
putdocx paragraph, style(Heading1)
putdocx text ("三、数据可视化分析"), font("微软雅黑", 18) bold

* 3.1 Birth-weight distribution
putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("3.1 出生体重分布"), font("微软雅黑", 14) bold

putdocx textblock begin
下图展示了婴儿出生体重的整体分布情况，可以看出出生体重大致呈正态分布。
putdocx textblock end

* Histogram image, centered, 5.5 wide
putdocx paragraph, halign(center)
putdocx image "output/cattaneo2/figures/bweight_histogram.png", width(5.5)

* 3.2 Smoking and birth weight
putdocx paragraph, style(Heading2) spacing(before, 15)
putdocx text ("3.2 吸烟与出生体重关系"), font("微软雅黑", 14) bold

putdocx textblock begin
下图比较了吸烟母亲和非吸烟母亲所生婴儿的平均出生体重。
可以直观看出吸烟母亲的婴儿出生体重明显较低。
putdocx textblock end

* Bar chart image, centered, 5.5 wide
putdocx paragraph, halign(center)
putdocx image "output/cattaneo2/figures/bweight_by_smoking.png", width(5.5)

putdocx pagebreak

* Chapter 4: DDML results
putdocx paragraph, style(Heading1)
putdocx text ("四、DDML因果效应估计"), font("微软雅黑", 18) bold

putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("4.1 模型设定"), font("微软雅黑", 14) bold

putdocx textblock begin
我们使用双重机器学习方法估计母亲吸烟对婴儿出生体重的因果效应。
模型设定如下：
- 结果方程：E[Y|X,D] 使用回归方法
- 处理方程：E[D|X] 使用Logit方法
- 交叉验证：5折交叉验证，重复5次
- 控制变量：产前检查、婚姻状况、是否第一胎、母亲年龄、母亲教育年限
putdocx textblock end

putdocx paragraph, style(Heading2) spacing(before, 15)
putdocx text ("4.2 因果效应估计结果"), font("微软雅黑", 14) bold

putdocx textblock begin
表3展示了DDML方法和传统OLS方法的估计结果对比。DDML方法通过交叉拟合和去偏处理，
能够更准确地估计因果效应，减少模型选择偏误和过拟合问题。
putdocx textblock end

putdocx paragraph
putdocx text ("表3 母亲吸烟对出生体重的因果效应估计"), font("微软雅黑", 10) bold

* Table 3 (three-line table). Rows: 1 header, 2 coefficient, 3 standard error,
* 4 confidence interval, 5 observations, 6 R-squared, 7 controls.
putdocx table tbl3 = (7, 3), border(all, nil) layout(autofitcontents)
putdocx table tbl3(1,.), border(top, single, black, 1.5)
* Rule directly under the header row
putdocx table tbl3(1,.), border(bottom, single, black, 1)
* Thin rule separating the estimates (rows 2-4) from the statistics (rows 5-7)
putdocx table tbl3(4,.), border(bottom, single, black, 0.5)
putdocx table tbl3(7,.), border(bottom, single, black, 1.5)

* Header row
putdocx table tbl3(1,1) = (""), font("微软雅黑", 10) bold halign(left)
putdocx table tbl3(1,2) = ("DDML估计"), font("微软雅黑", 10) bold halign(center)
putdocx table tbl3(1,3) = ("OLS估计"), font("微软雅黑", 10) bold halign(center)

* Every number is formatted with string() before it is written; a raw macro
* would print at full stored precision.

* Point estimates
putdocx table tbl3(2,1) = ("吸烟效应（克）"), font("微软雅黑", 9) halign(left)
putdocx table tbl3(2,2) = (string(`ddml_coef', "%9.2f")), font("微软雅黑", 9) halign(center)
putdocx table tbl3(2,3) = (string(`ols_coef', "%9.2f")), font("微软雅黑", 9) halign(center)

* Standard errors, in parentheses
putdocx table tbl3(3,1) = (""), font("微软雅黑", 8) halign(left)
putdocx table tbl3(3,2) = ("(" + string(`ddml_se', "%9.2f") + ")"), font("微软雅黑", 8) halign(center) italic
putdocx table tbl3(3,3) = ("(" + string(`ols_se', "%9.2f") + ")"), font("微软雅黑", 8) halign(center) italic

* 95% confidence intervals
putdocx table tbl3(4,1) = ("95%置信区间"), font("微软雅黑", 9) halign(left)
putdocx table tbl3(4,2) = ("[" + string(`ddml_ci_lower', "%9.2f") + ", " + string(`ddml_ci_upper', "%9.2f") + "]"), font("微软雅黑", 8) halign(center)
putdocx table tbl3(4,3) = ("[" + string(`ols_ci_lower', "%9.2f") + ", " + string(`ols_ci_upper', "%9.2f") + "]"), font("微软雅黑", 8) halign(center)

* Sample sizes
putdocx table tbl3(5,1) = ("观测数"), font("微软雅黑", 9) halign(left)
putdocx table tbl3(5,2) = (string(`total_obs', "%9.0fc")), font("微软雅黑", 9) halign(center)
putdocx table tbl3(5,3) = (string(`ols_n', "%9.0fc")), font("微软雅黑", 9) halign(center)

* R-squared (reported for OLS only)
putdocx table tbl3(6,1) = ("R²"), font("微软雅黑", 9) halign(left)
putdocx table tbl3(6,2) = ("—"), font("微软雅黑", 9) halign(center)
putdocx table tbl3(6,3) = (string(`ols_r2', "%9.3f")), font("微软雅黑", 9) halign(center)

* Controls indicator
putdocx table tbl3(7,1) = ("控制变量"), font("微软雅黑", 9) halign(left)
putdocx table tbl3(7,2) = ("是"), font("微软雅黑", 9) halign(center)
putdocx table tbl3(7,3) = ("是"), font("微软雅黑", 9) halign(center)

* Table notes
putdocx paragraph
putdocx text ("注：括号内为标准误。控制变量包括产前检查、婚姻状况、是否第一胎、母亲年龄和教育年限。"), font("微软雅黑", 8) italic
putdocx paragraph
putdocx text ("DDML采用5折交叉验证，重复5次。负值表示吸烟降低出生体重。"), font("微软雅黑", 8) italic

putdocx paragraph, style(Heading2) spacing(before, 15)
putdocx text ("4.3 结果可视化"), font("微软雅黑", 14) bold

putdocx textblock begin
图1展示了DDML和OLS两种方法的估计结果及其置信区间的对比。
可以看出，两种方法都显示吸烟对出生体重有显著的负面影响。
putdocx textblock end

putdocx paragraph, halign(center)
putdocx image "output/cattaneo2/figures/causal_effect_comparison.png", width(5.5)

putdocx pagebreak

* 4.4 Full OLS regression table
putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("4.4 OLS回归详细结果"), font("微软雅黑", 14) bold

putdocx textblock begin
表4展示了OLS回归的完整结果，包括所有控制变量的系数估计。
这有助于理解各个因素对出生体重的影响。
putdocx textblock end

* Make the stored OLS results active, then collect every coefficient, standard
* error and significance marker before any putdocx command is issued.
estimates restore ols_results
local t4_terms  mbsmoke prenatal1 mmarried fbaby mage medu _cons
local t4_labels `""母亲吸烟" "产前检查" "母亲已婚" "第一胎" "母亲年龄" "母亲教育年限" "常数项""'
local t4_k : word count `t4_terms'
forvalues i = 1/`t4_k' {
    local term : word `i' of `t4_terms'
    local b  = _b[`term']
    local se = _se[`term']
    * Two-sided p-value from the t distribution with the residual df
    local p = 2*ttail(e(df_r), abs(`b'/`se'))
    * Stars follow the computed p-value; a missing p-value gets no stars
    local stars ""
    if `p' < 0.10 local stars "*"
    if `p' < 0.05 local stars "**"
    if `p' < 0.01 local stars "***"
    local t4_b`i'  = string(`b', "%9.2f") + "`stars'"
    local t4_se`i' = "(" + string(`se', "%9.2f") + ")"
}
local ols_r2_txt = string(`ols_r2', "%9.3f")

putdocx paragraph
putdocx text ("表4 OLS回归完整结果（因变量：出生体重）"), font("微软雅黑", 10) bold

* Three-line table: header row, then a coefficient row and a standard-error
* row for each term
local t4_rows = 2*`t4_k' + 1
putdocx table tbl4 = (`t4_rows', 2), border(all, nil) layout(autofitcontents)
putdocx table tbl4(1,.), border(top, single, black, 1.5)
putdocx table tbl4(1,.), border(bottom, single, black, 1)
putdocx table tbl4(`t4_rows',.), border(bottom, single, black, 1.5)

* Header row
putdocx table tbl4(1,1) = ("变量"), font("微软雅黑", 10) bold halign(left)
putdocx table tbl4(1,2) = ("系数"), font("微软雅黑", 10) bold halign(center)

* Data rows
forvalues i = 1/`t4_k' {
    local lab : word `i' of `t4_labels'
    local brow = 2*`i'
    local srow = `brow' + 1
    putdocx table tbl4(`brow',1) = ("`lab'"), font("微软雅黑", 9) halign(left)
    putdocx table tbl4(`brow',2) = ("`t4_b`i''"), font("微软雅黑", 9) halign(center)
    putdocx table tbl4(`srow',1) = (""), font("微软雅黑", 8) halign(left)
    putdocx table tbl4(`srow',2) = ("`t4_se`i''"), font("微软雅黑", 8) halign(center) italic
}

* Table notes. The regression was fitted with the default (non-robust)
* variance estimator, so the note refers to plain standard errors.
putdocx paragraph
putdocx text ("注：括号内为标准误。*** p<0.01, ** p<0.05, * p<0.1。"), font("微软雅黑", 8) italic
putdocx paragraph
putdocx text ("样本量：`ols_n'，R² = `ols_r2_txt'。"), font("微软雅黑", 8) italic

putdocx pagebreak

* Chapter 5: conclusions and policy implications
putdocx paragraph, style(Heading1)
putdocx text ("五、结论与政策含义"), font("微软雅黑", 18) bold

putdocx paragraph, style(Heading2) spacing(before, 10)
putdocx text ("5.1 主要发现"), font("微软雅黑", 14) bold

* The findings are generated from the DDML estimate and its 95% interval, so
* the reported magnitude, direction and significance always match the
* numbers in the tables.
local eff_txt = string(`ddml_coef', "%9.1f")
local lo_txt  = string(`ddml_ci_lower', "%9.1f")
local hi_txt  = string(`ddml_ci_upper', "%9.1f")
local dir_txt = cond(`ddml_coef' < 0, "降低", "提高")
* Significant at the 5% level exactly when the 95% interval excludes zero
if (`ddml_ci_lower' > 0) | (`ddml_ci_upper' < 0) {
    local sig_txt "在5%水平上具有统计显著性"
}
else {
    local sig_txt "在5%水平上不具有统计显著性"
}

putdocx paragraph
putdocx text ("基于双重机器学习的分析，我们得出以下主要结论：")
putdocx paragraph
putdocx text ("1. DDML估计表明母亲孕期吸烟`dir_txt'婴儿出生体重")
putdocx paragraph
putdocx text ("2. 控制混杂因素后，吸烟的因果效应约为`eff_txt'克（95%置信区间：[`lo_txt', `hi_txt']）")
putdocx paragraph
putdocx text ("3. 这一效应`sig_txt'")
putdocx paragraph
putdocx text ("4. DDML方法有效控制了混杂偏倚")

putdocx paragraph, style(Heading2) spacing(before, 15)
putdocx text ("5.2 政策建议"), font("微软雅黑", 14) bold

putdocx textblock begin
基于研究结果，我们提出以下政策建议：

1. 加强孕期健康教育，宣传吸烟危害
2. 提供戒烟支持服务，帮助孕妇戒烟
3. 对高危人群进行重点干预
4. 将控烟纳入产前保健常规项目
putdocx textblock end

putdocx paragraph, style(Heading2) spacing(before, 15)
putdocx text ("5.3 研究局限"), font("微软雅黑", 14) bold

putdocx textblock begin
本研究存在以下局限：
1. 观测性研究，无法完全排除未观测混杂
2. 样本代表性可能受限
3. 吸烟强度和持续时间信息不足
putdocx textblock end

* Write the Word document to disk
putdocx save "output/cattaneo2/cattaneo2_ddml_analysis_report.docx", replace

/*------------------------------------------------------------------------------
第六部分：保存结果
------------------------------------------------------------------------------*/

* Save the analysis dataset
save "output/cattaneo2/cattaneo2_results.dta", replace

* Show the descriptive statistics once more
summarize $Y $D $X

* Export the descriptive statistics as a real CSV file.
* postfile writes a Stata dataset, so the statistics are posted to a temporary
* file and then written out with export delimited. Each variable is summarized
* separately so that every row carries its own mean/sd/min/max.
tempname memhold
tempfile stats
postfile `memhold' str20 variable double(mean sd min max) using "`stats'", replace
* Row names, in the same order as $Y $D $X
local stat_names `""出生体重" "母亲吸烟" "产前检查" "婚姻状况" "第一胎" "母亲年龄" "母亲教育""'
local i = 0
foreach v of varlist $Y $D $X {
    local ++i
    local nm : word `i' of `stat_names'
    quietly summarize `v'
    post `memhold' ("`nm'") (r(mean)) (r(sd)) (r(min)) (r(max))
}
postclose `memhold'

* Swap in the statistics only long enough to export them
preserve
use "`stats'", clear
export delimited using "output/cattaneo2/descriptive_stats.csv", replace
restore

* Close the log if one is open; capture avoids an error when none is
capture log close
